Data Source

File Description
Emission.csv CO2 emissions (metric tons per capita); Emissions data are sourced from Climate Watch Historical GHG Emissions ( 1990-2020 ). 2023. Washington, DC: World Resources Institute. Available online at: https://data.worldbank.org/indicator/EN.ATM.CO2E.PC.
Renewable.csv Renewable energy consumption (% of total final energy consumption); IEA, IRENA, UNSD, World Bank, WHO. 2023. Tracking SDG 7: The Energy Progress Report. World Bank, Washington DC. © World Bank. License: Creative Commons Attribution—NonCommercial 3.0 IGO ( CC BY-NC 3.0 IGO ). Available online at: https://data.worldbank.org/indicator/EG.FEC.RNEW.ZS.
GDPPCAP.csv GDP per capita (current US$); World Bank national accounts data, and OECD National Accounts data files. Available online at: https://data.worldbank.org/indicator/NY.GDP.PCAP.CD

Individual Dataset Cleansing & Analysis

# Package loading
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggcorrplot)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(plm)
## 
## Attaching package: 'plm'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead

Carbon Emission per Capita Dataset

# Data loading
df1 = read_csv("Emission.csv")
## Rows: 266 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Country Name, Country Code, Indicator Name, Indicator Code
## dbl (31): 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df1
## # A tibble: 266 × 35
##    `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1990` `1991`
##    <chr>          <chr>          <chr>            <chr>             <dbl>  <dbl>
##  1 Aruba          ABW            CO2 emissions (… EN.ATM.CO2E.PC   NA     NA    
##  2 Africa Easter… AFE            CO2 emissions (… EN.ATM.CO2E.PC    0.983  0.942
##  3 Afghanistan    AFG            CO2 emissions (… EN.ATM.CO2E.PC    0.191  0.181
##  4 Africa Wester… AFW            CO2 emissions (… EN.ATM.CO2E.PC    0.470  0.521
##  5 Angola         AGO            CO2 emissions (… EN.ATM.CO2E.PC    0.555  0.546
##  6 Albania        ALB            CO2 emissions (… EN.ATM.CO2E.PC    1.84   1.26 
##  7 Andorra        AND            CO2 emissions (… EN.ATM.CO2E.PC    7.59   7.34 
##  8 Arab World     ARB            CO2 emissions (… EN.ATM.CO2E.PC    2.80   2.75 
##  9 United Arab E… ARE            CO2 emissions (… EN.ATM.CO2E.PC   29.1   30.7  
## 10 Argentina      ARG            CO2 emissions (… EN.ATM.CO2E.PC    3.07   3.20 
## # ℹ 256 more rows
## # ℹ 29 more variables: `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>,
## #   `1996` <dbl>, `1997` <dbl>, `1998` <dbl>, `1999` <dbl>, `2000` <dbl>,
## #   `2001` <dbl>, `2002` <dbl>, `2003` <dbl>, `2004` <dbl>, `2005` <dbl>,
## #   `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>, `2010` <dbl>,
## #   `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>,
## #   `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <dbl>, `2020` <dbl>
# Rename column
names(df1)[names(df1) == "Country Name"] <- "Country_Name"
names(df1)[names(df1) == "Country Code"] <- "Country_Code"
names(df1)[names(df1) == "Indicator Name"] <- "Indicator_Name"
names(df1)[names(df1) == "Indicator Code"] <- "Indicator_Code"
# Data inspection
# Statistics function
summarize_numeric = function(dataset) {
  
  dataset = select_if(dataset, is.numeric)
  summary.table = data.frame(Attribute = names(dataset))
  
  summary.table = summary.table %>% 
    mutate("Missing Values" = apply(dataset, 2, function (x) sum(is.na(x))),
           "Unique Values" = apply(dataset, 2, function (x) length(unique(x))),
           "Mean" = colMeans(dataset, na.rm = TRUE),
           "Min" = apply(dataset, 2, function (x) min(x, na.rm = TRUE)),
           "Max" = apply(dataset, 2, function (x) max(x, na.rm = TRUE)),
           "SD" = apply(dataset, 2, function (x) sd(x, na.rm = TRUE))
    )
  summary.table
}

# Statistics display
summarize_numeric(df1)
##    Attribute Missing Values Unique Values     Mean        Min      Max       SD
## 1       1990             28           233 4.178043 0.00000000 29.55349 5.305961
## 2       1991             27           234 4.074045 0.00000000 32.82686 5.275075
## 3       1992             27           238 4.037818 0.00000000 31.14326 5.106431
## 4       1993             27           238 3.991411 0.00000000 34.18354 5.163018
## 5       1994             27           238 3.973801 0.00000000 36.93155 5.261851
## 6       1995             27           238 3.965076 0.00000000 36.97628 5.147698
## 7       1996             27           238 4.035097 0.00000000 39.56827 5.263985
## 8       1997             27           238 4.067742 0.00000000 46.11881 5.412366
## 9       1998             27           238 4.058579 0.00000000 45.61590 5.350864
## 10      1999             27           238 4.039246 0.00000000 47.28894 5.382174
## 11      2000             27           238 4.049275 0.00000000 44.37925 5.329736
## 12      2001             27           238 4.110546 0.00000000 42.20579 5.341719
## 13      2002             27           238 4.130324 0.03194220 45.56469 5.456114
## 14      2003             27           238 4.254188 0.02495319 46.41689 5.582623
## 15      2004             27           238 4.322976 0.02241417 47.65696 5.650412
## 16      2005             27           238 4.346139 0.02178952 45.40609 5.619383
## 17      2006             27           238 4.411055 0.02501897 43.28857 5.586445
## 18      2007             27           238 4.409894 0.02417992 40.60945 5.408679
## 19      2008             27           238 4.378026 0.02400307 36.88995 5.271242
## 20      2009             27           238 4.170807 0.02274563 33.72730 4.927421
## 21      2010             27           238 4.304683 0.03542391 35.54827 5.069194
## 22      2011             27           238 4.306717 0.03799811 37.97949 5.080191
## 23      2012             27           238 4.306728 0.03718042 39.58214 5.073588
## 24      2013             27           238 4.274821 0.02511230 37.60288 4.994218
## 25      2014             27           238 4.186761 0.02717383 37.10503 4.855477
## 26      2015             27           238 4.122951 0.03419362 35.29042 4.714661
## 27      2016             27           238 4.105521 0.02973242 33.54957 4.626153
## 28      2017             27           238 4.108484 0.03375521 32.25664 4.544475
## 29      2018             27           238 4.102909 0.03231124 31.48097 4.472633
## 30      2019             27           238 4.044442 0.03371488 31.87720 4.411878
## 31      2020             27           238 3.767934 0.03258478 31.72684 4.315248
# Replace the missing value with the interpolate method
# Function to interpolate missing values
interpolate_missing <- function(x) {
  first_non_na <- which(!is.na(x))[1]  # Find first non-NA value
  last_non_na <- tail(which(!is.na(x)), 1)  # Find last non-NA value
  interpolated_values <- approx(
    x = c(first_non_na, last_non_na),
    y = x[c(first_non_na, last_non_na)],
    xout = which(is.na(x))
  )$y  # Interpolate missing values
  replace(x, which(is.na(x)), interpolated_values)  # Replace missing values
}

# Apply the function across relevant columns
df1 <- df1 %>%
  mutate(across(matches("^19\\d{2}$"), interpolate_missing))
# Drop remaining missing values
df1 <- df1 %>%
  drop_na()

# Check missing value again
sum(is.na(df1))
## [1] 0
# Data transformation for analysis purpose
df1_long <- df1 %>% pivot_longer(cols = -c("Country_Name", "Country_Code", "Indicator_Name", "Indicator_Code"), 
                                    names_to = "Year", 
                                    values_to = "CO2E_PC")
df1_long <- df1_long %>%
  select(-Indicator_Name, -Indicator_Code)
df1_long
## # A tibble: 7,409 × 4
##    Country_Name                Country_Code Year  CO2E_PC
##    <chr>                       <chr>        <chr>   <dbl>
##  1 Africa Eastern and Southern AFE          1990    0.983
##  2 Africa Eastern and Southern AFE          1991    0.942
##  3 Africa Eastern and Southern AFE          1992    0.908
##  4 Africa Eastern and Southern AFE          1993    0.910
##  5 Africa Eastern and Southern AFE          1994    0.913
##  6 Africa Eastern and Southern AFE          1995    0.933
##  7 Africa Eastern and Southern AFE          1996    0.943
##  8 Africa Eastern and Southern AFE          1997    0.962
##  9 Africa Eastern and Southern AFE          1998    0.963
## 10 Africa Eastern and Southern AFE          1999    0.902
## # ℹ 7,399 more rows
# Finding the two countries with the largest CO2 emissions over time and the two countries with the least CO2 emissions over time
# Aggregate data by country to find total emissions over time
country_total_emissions <- df1_long %>%
  group_by(Country_Name) %>%
  summarise(total_emissions = sum(CO2E_PC, na.rm = TRUE)) %>%
  arrange(desc(total_emissions))

# Select top 2 and bottom 2 countries based on total emissions
top_countries <- head(country_total_emissions, 2)
top_countries
## # A tibble: 2 × 2
##   Country_Name         total_emissions
##   <chr>                          <dbl>
## 1 Qatar                          1189.
## 2 United Arab Emirates            773.
bottom_countries <- tail(country_total_emissions, 2)
bottom_countries
## # A tibble: 2 × 2
##   Country_Name     total_emissions
##   <chr>                      <dbl>
## 1 Congo, Dem. Rep.            1.34
## 2 Burundi                     1.13

# Interactive heat map showing the world CO2 emission in 2020
world_map_2020 <- df1_long %>%
  filter(Year == "2020")

zmin <- min(world_map_2020$CO2E_PC, na.rm = TRUE)
zmax <- max(world_map_2020$CO2E_PC, na.rm = TRUE)

map <- plot_geo() %>%
  add_trace(
    data = world_map_2020,
    type = 'choropleth',
    locations = ~Country_Name,
    locationmode = "country names",
    z = ~CO2E_PC,
    colorscale = "Viridis",
    colorbar = list(title = "CO2 Emissions"),
    hoverinfo = "text",
    text = ~paste(" Country: ", Country_Name, "<br>",
                  "CO2 Emissions: ", CO2E_PC, "<br>",
                  "Year: ", Year),
    zmin = zmin,
    zmax = zmax
  ) %>%
  layout(
    title = "World CO2 Emissions per Capita in 2020",
    yaxis = list(title = "CO2 Emissions")
  )

map
# The country with the maximum CO2 emission per capita in 2020
max_index <- which.max(world_map_2020$CO2E_PC)
world_map_2020$Country_Name[max_index]
## [1] "Qatar"
# The country with the minimum CO2 emission per capita in 2020
min_index <- which.min(world_map_2020$CO2E_PC)
world_map_2020$Country_Name[min_index]
## [1] "Congo, Dem. Rep."

Renewable Energy Consumption Dataset

# Data loading
df2 = read_csv("Renewable.csv")
## Rows: 266 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Country Name, Country Code, Indicator Name, Indicator Code
## dbl (31): 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2
## # A tibble: 266 × 35
##    `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1990` `1991`
##    <chr>          <chr>          <chr>            <chr>             <dbl>  <dbl>
##  1 Aruba          ABW            Renewable energ… EG.FEC.RNEW.ZS     0.27   0.23
##  2 Africa Easter… AFE            Renewable energ… EG.FEC.RNEW.ZS    60.9   62.2 
##  3 Afghanistan    AFG            Renewable energ… EG.FEC.RNEW.ZS    23     23.7 
##  4 Africa Wester… AFW            Renewable energ… EG.FEC.RNEW.ZS    85.9   85.3 
##  5 Angola         AGO            Renewable energ… EG.FEC.RNEW.ZS    72.3   71.9 
##  6 Albania        ALB            Renewable energ… EG.FEC.RNEW.ZS    25.5   33.0 
##  7 Andorra        AND            Renewable energ… EG.FEC.RNEW.ZS    14.0   14.0 
##  8 Arab World     ARB            Renewable energ… EG.FEC.RNEW.ZS     7.44   7.35
##  9 United Arab E… ARE            Renewable energ… EG.FEC.RNEW.ZS     0      0   
## 10 Argentina      ARG            Renewable energ… EG.FEC.RNEW.ZS     8.57   8.42
## # ℹ 256 more rows
## # ℹ 29 more variables: `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>,
## #   `1996` <dbl>, `1997` <dbl>, `1998` <dbl>, `1999` <dbl>, `2000` <dbl>,
## #   `2001` <dbl>, `2002` <dbl>, `2003` <dbl>, `2004` <dbl>, `2005` <dbl>,
## #   `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>, `2010` <dbl>,
## #   `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>,
## #   `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <dbl>, `2020` <dbl>
# Rename column
names(df2)[names(df2) == "Country Name"] <- "Country_Name"
names(df2)[names(df2) == "Country Code"] <- "Country_Code"
names(df2)[names(df2) == "Indicator Name"] <- "Indicator_Name"
names(df2)[names(df2) == "Indicator Code"] <- "Indicator_Code"
# Statistics display
summarize_numeric(df2)
##    Attribute Missing Values Unique Values     Mean Min   Max       SD
## 1       1990             12           227 32.39952   0 96.78 32.29705
## 2       1991             11           232 32.46059   0 96.79 32.25048
## 3       1992             10           232 32.86831   0 97.51 32.13679
## 4       1993             10           230 33.00395   0 96.95 31.99526
## 5       1994             10           237 33.03609   0 97.45 31.84773
## 6       1995              9           238 33.07554   0 96.90 31.70366
## 7       1996              9           236 32.75749   0 96.96 31.42452
## 8       1997              9           232 32.38152   0 97.06 31.20138
## 9       1998              9           234 32.20223   0 96.85 31.05204
## 10      1999              9           238 32.15999   0 96.99 30.93266
## 11      2000              8           237 31.95186   0 97.94 30.64193
## 12      2001              8           240 31.52932   0 98.34 30.56227
## 13      2002              8           241 31.63757   0 98.27 30.44880
## 14      2003              8           238 31.31144   0 97.97 30.19756
## 15      2004              8           241 31.05973   0 97.88 30.14808
## 16      2005              7           243 31.01494   0 97.42 30.15404
## 17      2006              7           241 30.69940   0 97.33 30.09696
## 18      2007              7           240 30.23537   0 97.17 29.77602
## 19      2008              7           239 30.27962   0 96.97 29.75985
## 20      2009              7           244 30.36141   0 97.02 29.51935
## 21      2010              7           243 30.11391   0 96.81 29.10630
## 22      2011              7           247 29.86616   0 96.21 28.81567
## 23      2012              6           249 29.74946   0 95.47 28.43671
## 24      2013              6           245 29.96092   0 95.08 28.21813
## 25      2014              6           244 29.73732   0 94.82 28.08773
## 26      2015              6           247 29.53938   0 95.82 27.88875
## 27      2016              6           251 29.34641   0 97.03 27.61110
## 28      2017              6           250 29.16183   0 96.70 27.38423
## 29      2018              6           250 29.25110   0 96.38 27.20149
## 30      2019              6           251 29.15088   0 96.29 26.95235
## 31      2020              6           249 30.18518   0 96.16 27.14194
# Replace the missing value with the interpolate method
# Function to interpolate missing values
interpolate_missing <- function(x) {
  first_non_na <- which(!is.na(x))[1]  # Find first non-NA value
  last_non_na <- tail(which(!is.na(x)), 1)  # Find last non-NA value
  interpolated_values <- approx(
    x = c(first_non_na, last_non_na),
    y = x[c(first_non_na, last_non_na)],
    xout = which(is.na(x))
  )$y  # Interpolate missing values
  replace(x, which(is.na(x)), interpolated_values)  # Replace missing values
}

# Apply the function across relevant columns
df2 <- df2 %>%
  mutate(across(matches("^19\\d{2}$"), interpolate_missing))
# Drop remaining missing values
df2 <- df2 %>%
  drop_na()

# Check missing value again
sum(is.na(df2))
## [1] 0
# Data transformation for analysis purpose
df2_long <- df2 %>% pivot_longer(cols = -c("Country_Name", "Country_Code", "Indicator_Name", "Indicator_Code"), 
                                    names_to = "Year", 
                                    values_to = "RENEW_C")
df2_long <- df2_long %>%
  select(-Indicator_Name, -Indicator_Code)
df2_long
## # A tibble: 7,998 × 4
##    Country_Name Country_Code Year  RENEW_C
##    <chr>        <chr>        <chr>   <dbl>
##  1 Aruba        ABW          1990     0.27
##  2 Aruba        ABW          1991     0.23
##  3 Aruba        ABW          1992     0.24
##  4 Aruba        ABW          1993     0.2 
##  5 Aruba        ABW          1994     0.18
##  6 Aruba        ABW          1995     0.17
##  7 Aruba        ABW          1996     0.17
##  8 Aruba        ABW          1997     0.16
##  9 Aruba        ABW          1998     0.16
## 10 Aruba        ABW          1999     0.16
## # ℹ 7,988 more rows
# Finding the two countries with the largest renewable energy consumption% over time and the two countries with the least renewable energy consumption% over time
# Aggregate data by country to find mean consumption% over time
country_mean_conspt <- df2_long %>%
  group_by(Country_Name) %>%
  summarise(total_conspt = mean(RENEW_C, na.rm = TRUE)) %>%
  arrange(desc(total_conspt))

# Select top 2 and bottom 2 countries based on mean consumption%
top_countries <- head(country_mean_conspt, 2)
top_countries
## # A tibble: 2 × 2
##   Country_Name     total_conspt
##   <chr>                   <dbl>
## 1 Congo, Dem. Rep.         96.5
## 2 Uganda                   94.2
bottom_countries <- tail(country_mean_conspt, 2)
bottom_countries
## # A tibble: 2 × 2
##   Country_Name total_conspt
##   <chr>               <dbl>
## 1 Bahrain                 0
## 2 Gibraltar               0

# Interactive heat map showing the world renewable energy consumption% in 2020
world_map_2020_renew <- df2_long %>%
  filter(Year == "2020")

zmin <- min(world_map_2020_renew$RENEW_C, na.rm = TRUE)
zmax <- max(world_map_2020_renew$RENEW_C, na.rm = TRUE)

map <- plot_geo() %>%
  add_trace(
    data = world_map_2020_renew,
    type = 'choropleth',
    locations = ~Country_Name,
    locationmode = "country names",
    z = ~RENEW_C,
    colorscale = "Viridis",
    colorbar = list(title = "RENEW_C %"),
    hoverinfo = "text",
    text = ~paste(" Country: ", Country_Name, "<br>",
                  "RENEW_C %: ", RENEW_C, "<br>",
                  "Year: ", Year),
    zmin = zmin,
    zmax = zmax
  ) %>%
  layout(
    title = "World Renewable Energy Consumption% in 2020",
    yaxis = list(title = "RENEW_C %")
  )

map
# The country with the maximum Renewable Energy Consumption% in 2020
max_index <- which.max(world_map_2020_renew$RENEW_C)
world_map_2020_renew$Country_Name[max_index]
## [1] "Congo, Dem. Rep."
# The country with the minimum Renewable Energy Consumption% in 2020
min_index <- which.min(world_map_2020_renew$RENEW_C)
world_map_2020_renew$Country_Name[min_index]
## [1] "Bahrain"

Research Question 1: Do countries with higher shares of renewable energy consumption tend to have lower carbon dioxide emissions per capita?

# Merge the two data frames
merged_data1 <- inner_join(df1_long, df2_long, by = c("Country_Name", "Country_Code", "Year"))
# Method 1
# Panel data analysis - fixed effect
# Convert Year to a factor
merged_data1$Year <- as.factor(merged_data1$Year)

# Fit a fixed effects model
model1 <- plm(CO2E_PC ~ RENEW_C, data = merged_data1, index = c("Country_Name", "Country_Code", "Year"), model = "within")
## Warning in pdata.frame(data, index): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Summarize the model
summary(model1)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = CO2E_PC ~ RENEW_C, data = merged_data1, model = "within", 
##     index = c("Country_Name", "Country_Code", "Year"))
## 
## Unbalanced Panel: n = 237, T = 31-31, N = 7347
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -17.664793  -0.300330   0.008147   0.282583   9.292777 
## 
## Coefficients:
##          Estimate Std. Error t-value  Pr(>|t|)    
## RENEW_C -0.054637   0.002008  -27.21 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    8980.5
## Residual Sum of Squares: 8133.4
## R-Squared:      0.094323
## Adj. R-Squared: 0.06413
## F-statistic: 740.379 on 1 and 7109 DF, p-value: < 2.22e-16

The coefficient estimate for RENEW_C is -0.054637. This means that, on average, a one-unit increase in the share of renewable energy consumption is associated with a decrease of approximately 0.054637 units in CO2 emissions per capita. The coefficient estimate for RENEW_C is statistically significant, as indicated by the p-value (< 2.2e-16), which is much smaller than the conventional significance level of 0.05. This suggests strong evidence against the null hypothesis that the true coefficient is zero.

# Method 2
# Calculate compound growth rate for CO2 emissions
aggregated_data1 <- merged_data1 %>%
  arrange(Country_Name, Country_Code, Year) %>%
  group_by(Country_Name, Country_Code) %>%
  summarize(compound_growth_CO2 = ifelse(first(CO2E_PC) == 0, NA, 
                                         (last(CO2E_PC) / first(CO2E_PC))^(1 / (n() - 1)) - 1))
## `summarise()` has grouped output by 'Country_Name'. You can override using the
## `.groups` argument.
# Calculate compound growth rate for renewable energy consumption
aggregated_data2 <- merged_data1 %>%
  arrange(Country_Name, Country_Code, Year) %>%
  group_by(Country_Name, Country_Code) %>%
  summarize(compound_growth_RENEW = ifelse(first(RENEW_C) == 0, NA, 
                                         (last(RENEW_C) / first(RENEW_C))^(1 / (n() - 1)) - 1))
## `summarise()` has grouped output by 'Country_Name'. You can override using the
## `.groups` argument.
# Merge
merged_data2 <- inner_join(aggregated_data1, aggregated_data2, by = c("Country_Name", "Country_Code"))
merged_data2
## # A tibble: 237 × 4
## # Groups:   Country_Name [237]
##    Country_Name           Country_Code compound_growth_CO2 compound_growth_RENEW
##    <chr>                  <chr>                      <dbl>                 <dbl>
##  1 Afghanistan            AFG                     0.00518               -0.00892
##  2 Africa Eastern and So… AFE                    -0.00703                0.00274
##  3 Africa Western and Ce… AFW                    -0.000497              -0.00404
##  4 Albania                ALB                    -0.00589                0.0188 
##  5 Algeria                DZA                     0.0138                -0.00606
##  6 Andorra                AND                    -0.00907                0.0148 
##  7 Angola                 AGO                     0.00220               -0.00562
##  8 Antigua and Barbuda    ATG                     0.0133                NA      
##  9 Arab World             ARB                     0.0114                -0.0123 
## 10 Argentina              ARG                     0.00343                0.00462
## # ℹ 227 more rows
# Drop remaining missing values
merged_data2 <- merged_data2 %>%
  drop_na()

# Fit a linear regression model
model2 <- lm(compound_growth_CO2 ~ compound_growth_RENEW, data = merged_data2)

summary(model2)
## 
## Call:
## lm(formula = compound_growth_CO2 ~ compound_growth_RENEW, data = merged_data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.047443 -0.011088 -0.002286  0.009472  0.091955 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.008289   0.001360   6.095 4.81e-09 ***
## compound_growth_RENEW -0.442450   0.041353 -10.699  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02017 on 221 degrees of freedom
## Multiple R-squared:  0.3412, Adjusted R-squared:  0.3383 
## F-statistic: 114.5 on 1 and 221 DF,  p-value: < 2.2e-16

The estimated change in compound_growth_CO2 for a one-unit increase in compound_growth_RENEW. Here, it’s -0.442450, indicating that, on average, a one-unit increase in compound growth in renewable energy consumption is associated with a decrease of 0.442450 units in compound growth in CO2 emissions. The p-value associated with compound_growth_RENEW is very low (p < 2e-16), indicating that compound_growth_RENEW is significantly associated with compound_growth_CO2. It indicates that higher growth in renewable energy consumption associated with lower growth in CO2 emissions.

Both methods suggest similar conclusion of the relationship between the renewable energy consumption and the CO2 emissions per capita. However, the adjusted R-squared of the second model is higher than the first model.

Research Question 2: Are there notable exceptions or outliers where countries have high renewable energy consumption growth rate but still high carbon emissions growth rate?

# Calculate residuals
merged_data2$residuals <- residuals(model2)

# Identify outliers
outliers <- merged_data2 %>%
  filter(abs(residuals) > 2 * sd(residuals)) %>%
  arrange(desc(residuals))

outliers
## # A tibble: 0 × 5
## # Groups:   Country_Name [0]
## # ℹ 5 variables: Country_Name <chr>, Country_Code <chr>,
## #   compound_growth_CO2 <dbl>, compound_growth_RENEW <dbl>, residuals <dbl>

# Identify the above-average countries
high_renew_high_co2_countries <- merged_data2 %>%
  filter(compound_growth_RENEW > mean(merged_data2$compound_growth_RENEW) & compound_growth_CO2 > mean(merged_data2$compound_growth_CO2))

high_renew_high_co2_countries
## # A tibble: 6 × 5
## # Groups:   Country_Name [6]
##   Country_Name Country_Code compound_growth_CO2 compound_growth_RENEW residuals
##   <chr>        <chr>                      <dbl>                 <dbl>     <dbl>
## 1 Cambodia     KHM                      0.0720                0.0183    0.0718 
## 2 Grenada      GRD                      0.0276                0.00726   0.0225 
## 3 Korea, Rep.  KOR                      0.0217                0.0270    0.0253 
## 4 Saudi Arabia SAU                      0.00961               0.0136    0.00734
## 5 Tonga        TON                      0.00923               0.00981   0.00528
## 6 Uruguay      URY                      0.0145                0.0104    0.0108

Based on the standard deviation calculation, there are no outliers where countries exhibit both high renewable energy consumption growth rate and high carbon emissions growth rate. However, the above six countries stand out, as they have relatively high growth in percentages of renewable energy consumption, yet their growth in CO2 emissions per capita remain above average.

Possible reasons could be:

Korea, Rep. (KOR) and Saudi Arabia (SAU) are known for their rapid economic development and industrialization, which can contribute to increased energy demand and higher CO2 emissions. Uruguay (URY) has experienced significant economic growth in recent years, which may lead to higher energy consumption and emissions despite efforts to increase renewable energy usage.

Tonga (TON) may rely heavily on fossil fuels for energy generation due to limited access to renewable energy sources, resulting in higher CO2 emissions despite efforts to increase renewable energy consumption.

Cambodia (KHM) has experienced rapid population growth, which can contribute to increased energy demand and higher CO2 emissions even if renewable energy consumption is also increasing.

Grenada (GRD) may lack comprehensive energy efficiency measures, resulting in lower overall energy efficiency and higher CO2 emissions despite efforts to increase renewable energy usage.

Research Question 3: Is there a relationship between a country’s GDP per capita and its consumption of clean energy technologies?

# Data loading
df3 = read_csv("GDPPCAP.csv")
## Rows: 266 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Country Name, Country Code, Indicator Name, Indicator Code
## dbl (63): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df3
## # A tibble: 266 × 67
##    `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
##    <chr>          <chr>          <chr>            <chr>             <dbl>  <dbl>
##  1 Aruba          ABW            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
##  2 Africa Easter… AFE            GDP per capita … NY.GDP.PCAP.CD    141.   144. 
##  3 Afghanistan    AFG            GDP per capita … NY.GDP.PCAP.CD     62.4   62.4
##  4 Africa Wester… AFW            GDP per capita … NY.GDP.PCAP.CD    107.   112. 
##  5 Angola         AGO            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
##  6 Albania        ALB            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
##  7 Andorra        AND            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
##  8 Arab World     ARB            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
##  9 United Arab E… ARE            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
## 10 Argentina      ARG            GDP per capita … NY.GDP.PCAP.CD     NA     NA  
## # ℹ 256 more rows
## # ℹ 61 more variables: `1962` <dbl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## #   `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## #   `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## #   `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## #   `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## #   `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, …
# Rename column
names(df3)[names(df3) == "Country Name"] <- "Country_Name"
names(df3)[names(df3) == "Country Code"] <- "Country_Code"
names(df3)[names(df3) == "Indicator Name"] <- "Indicator_Name"
names(df3)[names(df3) == "Indicator Code"] <- "Indicator_Code"
# Statistics display
summarize_numeric(df3)
##    Attribute Missing Values Unique Values       Mean       Min        Max
## 1       1960            140           125   483.9389  25.09581   3007.123
## 2       1961            139           126   503.7171  27.27463   3066.563
## 3       1962            138           127   522.3165  27.95318   3243.843
## 4       1963            138           127   555.8441  25.80701   3374.515
## 5       1964            138           127   604.7388  17.33183   3573.941
## 6       1965            126           139   671.7889  15.11323   4081.424
## 7       1966            123           142   728.8942  11.79268   4228.745
## 8       1967            120           145   747.4988  16.52353   4336.427
## 9       1968            115           150   762.4532  21.50030   4695.923
## 10      1969            115           150   824.6320  21.45031   5032.145
## 11      1970            106           159   998.4128  20.65508  12077.764
## 12      1971            103           162  1104.0939  21.04422  13315.978
## 13      1972            103           162  1278.8398  23.19327  16169.870
## 14      1973            103           162  1596.5624  24.67841  20857.065
## 15      1974            102           163  2009.6350  41.19362  22324.014
## 16      1975            100           165  2365.9705  34.96895  27894.490
## 17      1976             99           166  2562.4168  38.93492  31282.772
## 18      1977             96           169  2810.9347  27.68471  35186.232
## 19      1978             97           168  3192.1873  29.06296  37681.438
## 20      1979             96           169  3791.6126  29.01321  45087.208
## 21      1980             83           182  4316.1537  31.02349  50900.263
## 22      1981             79           186  4125.7524  32.57102  44841.230
## 23      1982             77           188  3933.0886  42.63256  41461.446
## 24      1983             75           190  3774.1776  39.00077  39040.928
## 25      1984             73           192  3703.8561  36.06386  36516.690
## 26      1985             71           194  3686.3714  40.09944  37590.430
## 27      1986             69           196  4277.5543  42.12870  51943.188
## 28      1987             65           200  4898.9385  40.86630  62341.745
## 29      1988             59           206  5221.6986  39.64901  67133.238
## 30      1989             55           210  5283.5981  50.98703  66829.039
## 31      1990             40           225  5848.8560  52.74856  81813.020
## 32      1991             41           224  5942.3264  22.85037  81158.907
## 33      1992             38           227  6220.3505  30.11432  88882.550
## 34      1993             36           229  6015.9435  53.48024  82983.959
## 35      1994             34           231  6338.3860 104.68965  87042.272
## 36      1995             25           240  7127.4039 123.34776  99431.739
## 37      1996             25           240  7353.6146 130.81654  98927.677
## 38      1997             25           240  7227.9067 107.39297  88844.945
## 39      1998             24           241  7398.1265 123.83022  91110.977
## 40      1999             23           242  7702.5941  99.75725  89742.503
## 41      2000             19           246  7632.2255 122.96166  81763.828
## 42      2001             17           248  7614.2511 119.26186  82410.401
## 43      2002             12           253  8185.8386 110.46087  90151.546
## 44      2003             12           253  9417.3117 114.36701 111309.870
## 45      2004             12           253 10733.2596 128.53842 125435.894
## 46      2005             12           253 11631.8353 151.18854 130818.982
## 47      2006             11           254 12997.6194 166.27625 143289.073
## 48      2007             11           254 14811.9377 170.70688 184639.750
## 49      2008             10           255 15882.8823 194.71063 204097.114
## 50      2009             10           255 14184.4879 204.54476 168957.108
## 51      2010              9           256 14871.7205 222.66058 161780.745
## 52      2011              6           259 16373.5283 236.45135 179369.268
## 53      2012              8           257 16270.3160 238.20595 165497.098
## 54      2013              7           258 16786.4422 241.54767 185055.518
## 55      2014              6           259 17112.8975 257.81856 195772.724
## 56      2015              8           257 15457.5206 289.35963 170338.680
## 57      2016              8           257 15770.9738 242.53953 174412.495
## 58      2017              8           257 16572.9478 244.14542 173611.688
## 59      2018              8           257 17546.6914 232.06062 193968.090
## 60      2019              7           258 17435.4735 216.97297 199382.839
## 61      2020              8           257 16205.9172 216.82742 182537.387
## 62      2021              9           256 18431.1873 221.15780 235132.784
## 63      2022             22           243 18165.3255 259.02503 240862.182
##            SD
## 1    629.4091
## 2    653.2762
## 3    685.8917
## 4    728.9522
## 5    794.9367
## 6    873.9175
## 7    948.1006
## 8    981.5538
## 9   1009.0817
## 10  1087.9262
## 11  1492.5078
## 12  1644.6624
## 13  1943.1199
## 14  2444.9352
## 15  2987.2663
## 16  3869.5509
## 17  4224.9806
## 18  4626.9535
## 19  5031.1947
## 20  5987.8118
## 21  6940.9029
## 22  6508.7939
## 23  6010.7179
## 24  5596.5573
## 25  5428.7164
## 26  5432.4595
## 27  6589.2305
## 28  7871.7470
## 29  8486.3589
## 30  8493.9435
## 31  9810.9151
## 32  9986.0033
## 33 10662.6170
## 34 10175.2473
## 35 10877.8941
## 36 12334.5101
## 37 12504.9732
## 38 11770.1105
## 39 12196.9518
## 40 12725.9914
## 41 12279.2741
## 42 12213.5887
## 43 13072.0247
## 44 15225.1748
## 45 17224.0553
## 46 18281.0131
## 47 20422.0208
## 48 23566.9481
## 49 25164.5488
## 50 22015.2073
## 51 22507.7690
## 52 24592.2122
## 53 23886.3107
## 54 25387.8162
## 55 25960.0977
## 56 23391.0808
## 57 23696.0583
## 58 24498.6077
## 59 26078.3372
## 60 25785.8207
## 61 24484.4790
## 62 28575.5901
## 63 27195.3694
# Replace the missing value with the interpolate method
# Function to interpolate missing values
interpolate_missing <- function(x) {
  first_non_na <- which(!is.na(x))[1]  # Find first non-NA value
  last_non_na <- tail(which(!is.na(x)), 1)  # Find last non-NA value
  interpolated_values <- approx(
    x = c(first_non_na, last_non_na),
    y = x[c(first_non_na, last_non_na)],
    xout = which(is.na(x))
  )$y  # Interpolate missing values
  replace(x, which(is.na(x)), interpolated_values)  # Replace missing values
}

# Apply the function across relevant columns
df3 <- df3 %>%
  mutate(across(matches("^19\\d{2}$"), interpolate_missing))
# Drop remaining missing values
df3 <- df3 %>%
  drop_na()

# Check missing value again
sum(is.na(df3))
## [1] 0
# Data transformation for analysis purpose
df3_long <- df3 %>% pivot_longer(cols = -c("Country_Name", "Country_Code", "Indicator_Name", "Indicator_Code"), 
                                    names_to = "Year", 
                                    values_to = "GDP_PC")
df3_long <- df3_long %>%
  select(-Indicator_Name, -Indicator_Code)
df3_long
## # A tibble: 14,742 × 4
##    Country_Name                Country_Code Year  GDP_PC
##    <chr>                       <chr>        <chr>  <dbl>
##  1 Africa Eastern and Southern AFE          1960    141.
##  2 Africa Eastern and Southern AFE          1961    144.
##  3 Africa Eastern and Southern AFE          1962    149.
##  4 Africa Eastern and Southern AFE          1963    157.
##  5 Africa Eastern and Southern AFE          1964    167.
##  6 Africa Eastern and Southern AFE          1965    178.
##  7 Africa Eastern and Southern AFE          1966    187.
##  8 Africa Eastern and Southern AFE          1967    200.
##  9 Africa Eastern and Southern AFE          1968    210.
## 10 Africa Eastern and Southern AFE          1969    230.
## # ℹ 14,732 more rows
# Merge the GDP per capita data
merged_data3 <- inner_join(merged_data1, df3_long, by = c("Country_Name", "Country_Code", "Year"))
merged_data3
## # A tibble: 6,944 × 6
##    Country_Name                Country_Code Year  CO2E_PC RENEW_C GDP_PC
##    <chr>                       <chr>        <chr>   <dbl>   <dbl>  <dbl>
##  1 Africa Eastern and Southern AFE          1990    0.983    60.9   822.
##  2 Africa Eastern and Southern AFE          1991    0.942    62.2   865.
##  3 Africa Eastern and Southern AFE          1992    0.908    64.0   735.
##  4 Africa Eastern and Southern AFE          1993    0.910    64.7   710.
##  5 Africa Eastern and Southern AFE          1994    0.913    65.2   700.
##  6 Africa Eastern and Southern AFE          1995    0.933    64.8   766.
##  7 Africa Eastern and Southern AFE          1996    0.943    64.0   741.
##  8 Africa Eastern and Southern AFE          1997    0.962    63.3   763.
##  9 Africa Eastern and Southern AFE          1998    0.963    64.0   700.
## 10 Africa Eastern and Southern AFE          1999    0.902    65.1   674.
## # ℹ 6,934 more rows
# Method 1
# Panel data analysis - fixed effect
# Convert Year to a factor
merged_data3$Year <- as.factor(merged_data3$Year)

# Fit a fixed effects model
model3 <- plm(RENEW_C ~ CO2E_PC + GDP_PC, data = merged_data3, index = c("Country_Name", "Country_Code", "Year"), model = "within")
## Warning in pdata.frame(data, index): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Summarize the model
summary(model3)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = RENEW_C ~ CO2E_PC + GDP_PC, data = merged_data3, 
##     model = "within", index = c("Country_Name", "Country_Code", 
##         "Year"))
## 
## Unbalanced Panel: n = 224, T = 31-31, N = 6944
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -35.25295  -2.28341  -0.10909   2.19208  48.00568 
## 
## Coefficients:
##            Estimate  Std. Error  t-value Pr(>|t|)    
## CO2E_PC -1.8205e+00  6.6234e-02 -27.4860   <2e-16 ***
## GDP_PC   1.8281e-05  1.1738e-05   1.5574   0.1194    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    269930
## Residual Sum of Squares: 242010
## R-Squared:      0.10342
## Adj. R-Squared: 0.073392
## F-statistic: 387.46 on 2 and 6718 DF, p-value: < 2.22e-16

The coefficient estimate for GDP_PC is approximately 1.8281e-05, indicating that a one-unit increase in GDP per capita is associated with an increase of approximately 1.8281e-05 units in renewable energy consumption, holding other variables constant. This suggests that higher GDP per capita tends to be associated with a slight increase in renewable energy consumption. However, with a p-value of 0.1194, the coefficient estimate for GDP_PC is not statistically significant at conventional significance levels (such as α = 0.05). Therefore, we do not have sufficient evidence to conclude that the relationship between GDP per capita and renewable energy consumption is statistically significant in this regression model.

# Method 2
# Calculate compound growth rate for GDP per capita
aggregated_data3 <- merged_data3 %>%
  arrange(Country_Name, Country_Code, Year) %>%
  group_by(Country_Name, Country_Code) %>%
  summarize(compound_growth_GDPPC = ifelse(first(GDP_PC) == 0, NA, 
                                         (last(GDP_PC) / first(GDP_PC))^(1 / (n() - 1)) - 1))
## `summarise()` has grouped output by 'Country_Name'. You can override using the
## `.groups` argument.
# Merge
merged_data2 <- inner_join(merged_data2, aggregated_data3, by = c("Country_Name", "Country_Code"))
merged_data2
## # A tibble: 213 × 6
## # Groups:   Country_Name [213]
##    Country_Name Country_Code compound_growth_CO2 compound_growth_RENEW residuals
##    <chr>        <chr>                      <dbl>                 <dbl>     <dbl>
##  1 Africa East… AFE                    -0.00703                0.00274  -0.0141 
##  2 Africa West… AFW                    -0.000497              -0.00404  -0.0106 
##  3 Albania      ALB                    -0.00589                0.0188   -0.00586
##  4 Algeria      DZA                     0.0138                -0.00606   0.00281
##  5 Andorra      AND                    -0.00907                0.0148   -0.0108 
##  6 Angola       AGO                     0.00220               -0.00562  -0.00858
##  7 Arab World   ARB                     0.0114                -0.0123   -0.00235
##  8 Argentina    ARG                     0.00343                0.00462  -0.00282
##  9 Armenia      ARM                    -0.0277                 0.0469   -0.0152 
## 10 Australia    AUS                    -0.00146                0.0103   -0.00519
## # ℹ 203 more rows
## # ℹ 1 more variable: compound_growth_GDPPC <dbl>
# Drop remaining missing values
merged_data2 <- merged_data2 %>%
  drop_na()

# Fit a linear regression model
model4 <- lm(compound_growth_CO2 ~ compound_growth_RENEW + compound_growth_GDPPC, data = merged_data2)

summary(model4)
## 
## Call:
## lm(formula = compound_growth_CO2 ~ compound_growth_RENEW + compound_growth_GDPPC, 
##     data = merged_data2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.040456 -0.011207 -0.001928  0.008334  0.090328 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            9.001e-05  2.248e-03   0.040    0.968    
## compound_growth_RENEW -4.091e-01  3.935e-02 -10.398  < 2e-16 ***
## compound_growth_GDPPC  2.345e-01  4.929e-02   4.758 3.64e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01877 on 210 degrees of freedom
## Multiple R-squared:  0.4233, Adjusted R-squared:  0.4178 
## F-statistic: 77.06 on 2 and 210 DF,  p-value: < 2.2e-16

After transforming GDP_PC into CAGR_GDP_PC, the coefficient is 0.2345, indicating that for every unit increase in compound_growth_GDPPC, compound_growth_CO2 is estimated to increase by 0.2345 units. This effect is also highly significant (p = 3.64e-06). Additionally, approximately 42.33% of the variability in compound_growth_CO2 is explained by the predictors in this model. Therefore, this model is overall significant and provides a better estimation of the relationships.